MapReduce 读写 Parquet 格式数据 Demo

import java.io.IOException;

import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.LongWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
import org.apache.parquet.example.data.Group;
import org.apache.parquet.example.data.simple.SimpleGroupFactory;
import org.apache.parquet.hadoop.ParquetInputFormat;
import org.apache.parquet.hadoop.ParquetOutputFormat;
import org.apache.parquet.hadoop.example.GroupReadSupport;
import org.apache.parquet.hadoop.example.GroupWriteSupport;
import org.apache.parquet.schema.MessageType;
import org.apache.parquet.schema.OriginalType;
import org.apache.parquet.schema.PrimitiveType.PrimitiveTypeName;
import org.apache.parquet.schema.Types;

/**
 * Demo MapReduce driver for reading and writing Parquet data.
 *
 * <p>Text input is expected to hold two whitespace-separated columns per line:
 * {@code city ip}. The driver takes three arguments, {@code <in> <out> <mode>}, where
 * {@code mode} selects the job layout:
 * <ul>
 *   <li>{@code 1} - text input, text output (map + reduce)</li>
 *   <li>{@code 2} - Parquet input, text output (map only)</li>
 *   <li>{@code 3} - text input, Parquet output (map only)</li>
 *   <li>{@code 4} - Parquet input, Parquet output; listed in the usage text but not
 *       implemented, so it is rejected with an error</li>
 * </ul>
 */
public class ParquetReaderAndWriteMRDemo {

    /** Exit code used for bad command-line arguments. */
    private static final int EXIT_USAGE = 2;

    /** Counter group/name under which skipped (malformed) text lines are reported. */
    private static final String COUNTER_GROUP = "ParquetDemo";
    private static final String COUNTER_MALFORMED = "MALFORMED_LINES";

    /** Column separator for text input; compiled once instead of once per record. */
    private static final java.util.regex.Pattern WHITESPACE =
            java.util.regex.Pattern.compile("\\s+");

    /**
     * Configures and runs the job selected by the third argument, then terminates the JVM
     * with exit code 0 on success, 1 if the job failed, or 2 on a usage error.
     *
     * @param args generic Hadoop options followed by {@code <in> <out> <mode>}
     * @throws Exception if job setup or submission fails
     */
    public static void main(String[] args) throws Exception {
        Configuration conf = new Configuration();
        String[] otherargs = new GenericOptionsParser(conf, args).getRemainingArgs();
        if (otherargs.length != 3) {
            printUsage();
            System.exit(EXIT_USAGE);
        }

        // This demo works on two string columns: city and ip.
        MessageType schema = Types.buildMessage()
                .required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("city")
                .required(PrimitiveTypeName.BINARY).as(OriginalType.UTF8).named("ip")
                .named("pair");
        System.out.println("[schema]==" + schema.toString());
        // Stored in the configuration so ParquetWriteMapper can read it back in setup().
        GroupWriteSupport.setSchema(schema, conf);

        Job job = Job.getInstance(conf, "ParquetReadMR");
        job.setJarByClass(ParquetReaderAndWriteMRDemo.class);

        switch (otherargs[2]) {
            case "1":
                // Plain text in, plain text out.
                job.setMapperClass(NormalMapper.class);
                job.setReducerClass(NormalReducer.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(Text.class);
                break;
            case "2":
                // Parquet in, plain text out.
                job.setMapperClass(ParquetReadMapper.class);
                job.setNumReduceTasks(0);
                job.setInputFormatClass(ParquetInputFormat.class);
                ParquetInputFormat.setReadSupportClass(job, GroupReadSupport.class);
                job.setOutputKeyClass(Text.class);
                job.setOutputValueClass(Text.class);
                break;
            case "3":
                // Plain text in, Parquet out.
                job.setMapperClass(ParquetWriteMapper.class);
                job.setNumReduceTasks(0);
                job.setOutputFormatClass(ParquetOutputFormat.class);
                ParquetOutputFormat.setWriteSupportClass(job, GroupWriteSupport.class);
                break;
            case "4":
                // Previously a silent no-op that still exited with status 0.
                System.err.println("mode 4 (<parquet-in> <parquet-out>) is not implemented");
                System.exit(EXIT_USAGE);
                break;
            default:
                printUsage();
                System.exit(EXIT_USAGE);
                break;
        }

        FileInputFormat.setInputPaths(job, otherargs[0]);
        FileOutputFormat.setOutputPath(job, new Path(otherargs[1]));

        // Propagate the job result to the caller instead of always exiting with 0.
        System.exit(job.waitForCompletion(true) ? 0 : 1);
    }

    /** Prints the accepted argument combinations. */
    private static void printUsage() {
        System.out.println("<in> <out> 1");
        System.out.println("<parquet-in> <out> 2");
        System.out.println("<in> <parquet-out> 3");
        System.out.println("<parquet-in> <parquet-out> 4");
    }

    /**
     * Splits a text line into whitespace-separated fields.
     *
     * <p>The line is trimmed first so that leading whitespace does not produce an empty
     * first field and shift the columns.
     *
     * @param line raw input line
     * @return the fields (at least two), or {@code null} if the line has fewer than two
     */
    private static String[] splitCityAndIp(Text line) {
        String[] fields = WHITESPACE.split(line.toString().trim());
        return fields.length < 2 ? null : fields;
    }

    /**
     * Map-only mapper that turns each {@code city ip} text line into a Parquet
     * {@link Group} built from the schema stored in the job configuration.
     */
    public static class ParquetWriteMapper extends Mapper<LongWritable, Text, Void, Group> {
        private SimpleGroupFactory factory = null;

        /** Creates the group factory from the schema stored in the configuration. */
        @Override
        protected void setup(Context context) throws IOException, InterruptedException {
            factory = new SimpleGroupFactory(GroupWriteSupport.getSchema(context.getConfiguration()));
        }

        /**
         * Emits one group per well-formed line; lines with fewer than two fields are
         * counted and skipped instead of failing the task.
         */
        @Override
        public void map(LongWritable _key, Text ivalue, Context context) throws IOException, InterruptedException {
            String[] fields = splitCityAndIp(ivalue);
            if (fields == null) {
                context.getCounter(COUNTER_GROUP, COUNTER_MALFORMED).increment(1);
                return;
            }
            Group pair = factory.newGroup();
            pair.append("city", fields[0]);
            pair.append("ip", fields[1]);
            // The key is always null (Void); only the group is written.
            context.write(null, pair);
        }
    }

    /**
     * Map-only mapper that reads Parquet groups and emits the first value of field 0 as
     * the key and the first value of field 1 as the value (city and ip when the input
     * follows this demo's schema).
     */
    public static class ParquetReadMapper extends Mapper<Void, Group, Text, Text> {
        @Override
        public void map(Void _key, Group group, Context context) throws IOException, InterruptedException {
            String city = group.getString(0, 0);
            String ip = group.getString(1, 0);
            context.write(new Text(city), new Text(ip));
        }
    }

    /** Mapper for text input: emits (first field, second field) for each line. */
    public static class NormalMapper extends Mapper<LongWritable, Text, Text, Text> {

        /**
         * Emits one pair per well-formed line; lines with fewer than two fields are
         * counted and skipped instead of failing the task.
         */
        @Override
        public void map(LongWritable ikey, Text ivalue, Context context) throws IOException, InterruptedException {
            String[] fields = splitCityAndIp(ivalue);
            if (fields == null) {
                context.getCounter(COUNTER_GROUP, COUNTER_MALFORMED).increment(1);
                return;
            }
            context.write(new Text(fields[0]), new Text(fields[1]));
        }
    }

    /** Reducer that writes every value back out under its key, unchanged. */
    public static class NormalReducer extends Reducer<Text, Text, Text, Text> {

        @Override
        public void reduce(Text _key, Iterable<Text> values, Context context) throws IOException, InterruptedException {
            for (Text text : values) {
                context.write(_key, text);
            }
        }
    }

}

 

posted @ 2017-08-18 14:38  Nucky_yang  阅读(4676)  评论(0编辑  收藏  举报